Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix RFCs/411: add std/cputicks #18743

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@

- Fixed buffer overflow bugs in `net`

- Added `std/cputicks` containing APIs for CPU counters with highest performance available.

- Exported `sslHandle` from `net` and `asyncnet`.

- Added `sections` iterator in `parsecfg`.
Expand Down
4 changes: 4 additions & 0 deletions compiler/vmops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ from sighashes import symBodyDigest

# There are some useful procs in vmconv.
import vmconv
from std/cputicks import getCpuTicks

template mathop(op) {.dirty.} =
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
Expand Down Expand Up @@ -340,3 +341,6 @@ proc registerAdditionalOps*(c: PCtx) =
let p = a.getVar(0)
let x = a.getFloat(1)
addFloatSprintf(p.strVal, x)

registerCallback c, "stdlib.cputicks.getCpuTicksImpl", proc(a: VmArgs) =
setResult(a, getCpuTicks())
3 changes: 3 additions & 0 deletions doc/lib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,9 @@ String handling
Time handling
-------------

* `cputicks <cputicks.html>`_
The `cputicks` module contains APIs for high performance CPU counters.

* `monotimes <monotimes.html>`_
The `monotimes` module implements monotonic timestamps.

Expand Down
113 changes: 113 additions & 0 deletions lib/std/cputicks.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
##[
Experimental API, subject to change.
]##

#[
Future work:
* convert ticks to time; see some approaches here: https://quick-bench.com/q/WcbqUWBCoNBJvCP4n8h3kYfZDXU
* provide feature detection to test whether the CPU supports it (on linux, via /proc/cpuinfo)
* test on ARMv8-A, ARMv8-M, arm64

## js
* we use `window.performance.now()`

## nodejs
* we use `process.hrtime.bigint()`

## ARM
* The ARMv8-A architecture[1] manual explicitly states that two reads to the PMCCNTR_EL0 register may return the same value[1a].
There is also the CNTVCT_EL0[1b] register, however it's unclear whether that register is even monotonic (it's implied, but not stated explicitly).
The ARMv8-M architecture[2] has the CYCCNT register, however all that's mentioned is that it is an "optional free-running 32-bit cycle counter"[2a].

## references
[1] https://documentation-service.arm.com/static/611fa684674a052ae36c7c91
[1a] See [1], PDF page 2852
[2] https://documentation-service.arm.com/static/60e6f8573d73a34b640e0cee
[2a] See [2], PDF page 367

## further links
* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/ia-32-ia-64-benchmark-code-execution-paper.pdf
* https://gist.github.com/savanovich/f07eda9dba9300eb9ccf
* https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched#
]#

when defined(js):
  proc getCpuTicksImpl(): int64 =
    ## Returns a best effort tick count expressed in nanoseconds.
    # xxx consider returning JsBigInt instead of float
    when defined(nodejs):
      # `process.hrtime.bigint()` is narrowed to a plain JS number here.
      {.emit: """
      let process = require('process');
      `result` = Number(process.hrtime.bigint());
      """.}
    else:
      # `performance.now()` reports fractional milliseconds as a float, so it
      # is imported as `float` and only turned into an integer after scaling
      # to nanoseconds (previously it was imported as `int64`, leaving a
      # non-integral value in an integer-typed result).
      proc jsNow(): float {.importjs: "window.performance.now()".}
      result = int64(jsNow() * 1_000_000.0)
else:
  # `__rdtsc` is imported from a C header; which header is used depends on
  # whether the target is posix.
  const header =
    when defined(posix): "<x86intrin.h>"
    else: "<intrin.h>"
  proc getCpuTicksImpl(): uint64 {.importc: "__rdtsc", header: header.}

template getCpuTicks*(): int64 =
  ## Reads a platform specific timestamp counter (oftentimes by means of the
  ## `RDTSC` instruction) and returns the number of CPU ticks.
  ##
  ## In contrast to `std/monotimes.ticks`, the counter is strictly monotonic,
  ## at least on recent enough x86 platforms. It also offers a higher
  ## resolution at a lower overhead, which makes it possible to measure
  ## individual instructions (i.e. time offsets in the nanosecond range).
  ## Where no timestamp counter is available, a best effort implementation is
  ## provided instead.
  ##
  ## Note that the CPU may reorder instructions.
  runnableExamples:
    for _ in 0..<100:
      let t1 = getCpuTicks()
      # code to benchmark can go here
      let t2 = getCpuTicks()
      assert t2 > t1
  # The backend implementation may return an unsigned value; reinterpret its
  # bits as `int64` rather than range-converting.
  cast[int64](getCpuTicksImpl())
timotheecour marked this conversation as resolved.
Show resolved Hide resolved

template toInt64(a, b): untyped =
  ## Combines two unsigned 32-bit halves into one `int64`: `a` supplies the
  ## low 32 bits and `b` the high 32 bits.
  # Both parameters are used explicitly; the template must not rely on a
  # variable that happens to be named `d` at the call site.
  cast[int64](uint64(a) or (uint64(b) shl 32))

proc getCpuTicksStart*(): int64 {.inline.} =
  ## Variant of `getCpuTicks` meant to be paired with `getCpuTicksEnd`: call
  ## this right before the code being measured and `getCpuTicksEnd` right
  ## after it. On native targets this executes `CPUID` followed by `RDTSC`,
  ## while `getCpuTicksEnd` executes `RDTSCP` followed by `CPUID`.
  ##
  ## Compared to `getCpuTicks`, this avoids introducing noise in the
  ## measurements caused by CPU instruction reordering, and can result in more
  ## deterministic results, at the expense of extra overhead and requiring
  ## asymmetric start/stop APIs.
  ##
  ## A best effort implementation (plain `getCpuTicks`) is used in the VM and
  ## on the js backend.
  runnableExamples:
    for i in 0..<100:
      let t1 = getCpuTicksStart()
      # code to benchmark can go here
      let t2 = getCpuTicksEnd()
      assert t2 > t1, $(t1, t2)
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      # `a` receives EAX (low half) and `d` receives EDX (high half); both are
      # written by the inline assembly below, hence `noinit`.
      var a {.noinit.}: cuint
      var d {.noinit.}: cuint
      # See https://developers.redhat.com/blog/2016/03/11/practical-micro-benchmarking-with-ltrace-and-sched
      # `cpuid` is issued first as a barrier against instruction reordering,
      # then the counter is read.
      {.emit:"""
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      asm volatile("rdtsc" : "=a" (a), "=d" (d));
      """.}
      result = toInt64(a, d)

proc getCpuTicksEnd*(): int64 {.inline.} =
  ## Counterpart of `getCpuTicksStart`, to be called right after the code
  ## being measured. See `getCpuTicksStart <#getCpuTicksStart>`_
  when nimvm: result = getCpuTicks()
  else:
    when defined(js): result = getCpuTicks()
    else:
      # `a` receives EAX (low half) and `d` receives EDX (high half); both are
      # written by the inline assembly below, hence `noinit`.
      var a {.noinit.}: cuint
      var d {.noinit.}: cuint
      # `rdtscp` also writes ECX, so that register is declared as clobbered;
      # the trailing `cpuid` is the barrier against later instructions being
      # reordered before the read.
      {.emit:"""
      asm volatile("rdtscp" : "=a" (a), "=d" (d) :: "%rcx");
      asm volatile("cpuid" ::: "%rax", "%rbx", "%rcx", "%rdx");
      """.}
      result = toInt64(a, d)
1 change: 1 addition & 0 deletions lib/std/monotimes.nim
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ that the actual supported time resolution differs for different systems.
See also
========
* `times module <times.html>`_
* `cputicks module <cputicks.html>`_ which provides cpu counters with highest available performance
]##

import times
Expand Down
22 changes: 22 additions & 0 deletions tests/stdlib/tcputicks.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
discard """
targets: "c cpp js"
matrix: "; -d:danger"
"""

import std/cputicks

template main =
  ## Checks that consecutive readings are strictly increasing, both for the
  ## plain counter and for the start/end pair.
  const rounds = 100

  # Back-to-back reads of the plain counter.
  for _ in 1..rounds:
    let
      t1 = getCpuTicks()
      t2 = getCpuTicks()
    doAssert t2 > t1

  # Start/end pair wrapped around an (empty) measured region.
  for _ in 1..rounds:
    let t1 = getCpuTicksStart()
    # code to benchmark can go here
    let t2 = getCpuTicksEnd()
    doAssert t2 > t1

# Run once in the compile-time VM and once at run time.
static: main()
main()