Skip to content

Commit

Permalink
Add ip_prefix_collapse function [7/n] (#11778)
Browse files Browse the repository at this point in the history
Summary:

Given a list of IP prefixes that are either all IPv4 or all IPv6, return the minimal CIDR representation. The list cannot contain nulls.

This is done by:
1. Sorting the list of IP prefixes.
2. Merging the sorted ranges.
3. Creating the minimal set of IP prefixes for each merged range.

Java implementation: https://github.com/prestodb/presto/blob/9f490e75e905e8d107b3e469cc146dace034ae7c/presto-main/src/main/java/com/facebook/presto/operator/scalar/IpPrefixFunctions.java#L214

Differential Revision: D65982872
  • Loading branch information
yuandagits authored and facebook-github-bot committed Feb 13, 2025
1 parent e274d16 commit 3e8a924
Show file tree
Hide file tree
Showing 3 changed files with 478 additions and 0 deletions.
10 changes: 10 additions & 0 deletions velox/docs/functions/presto/ipaddress.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,13 @@ IP Functions
SELECT is_subnet_of(IPPREFIX '192.168.3.131/26', IPPREFIX '192.168.3.144/30'); -- true
SELECT is_subnet_of(IPPREFIX '64:ff9b::17/64', IPPREFIX '64:ffff::17/64'); -- false
SELECT is_subnet_of(IPPREFIX '192.168.3.131/26', IPPREFIX '192.168.3.131/26'); -- true

.. function:: ip_prefix_collapse(array(ip_prefix)) -> array(ip_prefix)

Returns the minimal CIDR representation of the input ``IPPREFIX`` array. Every ``IPPREFIX``
in the input array must be the same IP version (that is, only IPv4 or only IPv6) or the
query will fail and raise an error. ::

SELECT IP_PREFIX_COLLAPSE(ARRAY[IPPREFIX '192.168.0.0/24', IPPREFIX '192.168.1.0/24']); -- [{192.168.0.0/23}]
SELECT IP_PREFIX_COLLAPSE(ARRAY[IPPREFIX '2620:10d:c090::/48', IPPREFIX '2620:10d:c091::/48']); -- [{2620:10d:c090::/47}]
SELECT IP_PREFIX_COLLAPSE(ARRAY[IPPREFIX '192.168.1.0/24', IPPREFIX '192.168.0.0/24', IPPREFIX '192.168.2.0/24', IPPREFIX '192.168.9.0/24']); -- [{192.168.0.0/23}, {192.168.2.0/24}, {192.168.9.0/24}]
224 changes: 224 additions & 0 deletions velox/functions/prestosql/IPAddressFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,228 @@ struct IPSubnetOfFunction {
}
};

/// ip_prefix_collapse(array(ipprefix)) -> array(ipprefix)
///
/// Returns the minimal list of CIDR prefixes that covers exactly the addresses
/// covered by the input prefixes. The work is done in three steps:
///   1. sort the prefixes by (address, prefix length),
///   2. merge overlapping and adjacent prefixes into inclusive address ranges,
///   3. split every merged range into the fewest aligned CIDR blocks.
///
/// Fails with a user error if the array contains a null element or if the
/// elements are not all of the same IP version. An empty input array produces
/// an empty result.
template <typename T>
struct IPPrefixCollapseFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE void call(
      out_type<Array<IPPrefix>>& result,
      const arg_type<Array<IPPrefix>>& ipPrefixes) {
    if (ipPrefixes.size() == 0) {
      return;
    }

    std::vector<std::tuple<int128_t, int8_t>> prefixes;
    prefixes.reserve(ipPrefixes.size());

    for (const auto& ipPrefix : ipPrefixes) {
      if (ipPrefix.has_value()) {
        prefixes.push_back(std::make_tuple(
            *ipPrefix->template at<0>(), *ipPrefix->template at<1>()));
      } else {
        // ip_prefix_collapse does not support null elements. Thus we throw here
        // with the same error message as Presto java.
        VELOX_USER_FAIL("ip_prefix_collapse does not support null elements");
      }
    }

    // A single non-null prefix is already minimal. Hand the vector over
    // instead of copying it into the by-value parameter.
    if (prefixes.size() == 1) {
      writeIpPrefix(result, std::move(prefixes));
      return;
    }

    std::sort(
        prefixes.begin(), prefixes.end(), [](const auto& a, const auto& b) {
          // Order by address first.
          const auto ipCompare =
              IPADDRESS()->compare(std::get<0>(a), std::get<0>(b));
          if (ipCompare != 0) {
            return ipCompare < 0;
          }

          // Same address: the shorter prefix length (the larger block) goes
          // first so that the blocks it contains follow it.
          return std::get<1>(a) < std::get<1>(b);
        });

    // All prefixes must be the same IP version. Every element is checked:
    // looking only at the first and last sorted elements is not sufficient,
    // because IPv4-mapped addresses (::ffff:a.b.c.d) sort between IPv6
    // addresses such as ::1 and 2001::.
    const bool isFirstIpV4 = isIPv4(std::get<0>(prefixes.front()));
    for (const auto& prefix : prefixes) {
      if (isIPv4(std::get<0>(prefix)) != isFirstIpV4) {
        VELOX_USER_FAIL("All IPPREFIX elements must be the same IP version.");
      }
    }

    auto mergedRanges = mergeIpRanges(prefixes);
    const auto ipMaxBitLength =
        isFirstIpV4 ? ipaddress::kIPV4Bits : ipaddress::kIPV6Bits;

    for (auto& range : mergedRanges) {
      writeIpPrefix(
          result,
          generateMinIpPrefixes(
              std::get<0>(range), std::get<1>(range), ipMaxBitLength));
    }
  }

 private:
  // Appends every (address, prefix length) tuple to the output array.
  FOLLY_ALWAYS_INLINE static void writeIpPrefix(
      exec::ArrayWriter<IPPrefix>& writer,
      std::vector<std::tuple<int128_t, int8_t>> ipprefixes) {
    for (auto& ipprefix : ipprefixes) {
      writer.add_item() = std::move(ipprefix);
    }
  }

  // Returns the number of bits needed to represent 'num', i.e. the 1-based
  // position of its highest set bit, or 0 when 'num' is zero.
  //
  // Computed with integer operations only. Converting a 128-bit value to
  // double (as std::log2 would) rounds values such as 2^64 - 1 up to 2^64 and
  // reports one bit too many.
  FOLLY_ALWAYS_INLINE static int64_t bitLength(uint128_t num) {
    if (num == 0) {
      return 0;
    }

    const auto upper = static_cast<uint64_t>(num >> 64);
    if (upper != 0) {
      return 128 - __builtin_clzll(upper);
    }

    // 'upper' is zero and 'num' is not, so the lower half is non-zero.
    return 64 - __builtin_clzll(static_cast<uint64_t>(num));
  }

  // Returns the 0-based index of the lowest set bit of 'x', or -1 if no bit
  // is set.
  FOLLY_ALWAYS_INLINE static int64_t getLowestSetBit(int128_t x) {
    if (x == 0) {
      return -1; // No set bits
    }

    const auto bits = static_cast<uint128_t>(x);

    // Check the lower 64 bits
    const auto lower = static_cast<uint64_t>(bits);
    if (lower != 0) {
      return __builtin_ctzll(lower);
    }

    // Check the upper 64 bits
    return __builtin_ctzll(static_cast<uint64_t>(bits >> 64)) + 64;
  }

  // Returns the number of host bits of the largest CIDR block that starts at
  // 'firstIpAddress' and does not extend past 'lastIpAddress'. The result is
  // always in [0, 128].
  FOLLY_ALWAYS_INLINE static int64_t findRangeBits(
      int128_t firstIpAddress,
      int128_t lastIpAddress) {
    constexpr int64_t kAddressBits = 128;

    // The number of IP addresses in the range. Computed modulo 2^128 in
    // unsigned arithmetic so that ranges crossing the signed boundary do not
    // overflow. A result of 0 means the range spans all 2^128 addresses.
    const uint128_t ipCount = static_cast<uint128_t>(lastIpAddress) -
        static_cast<uint128_t>(firstIpAddress) + 1;

    // We have two possibilities for determining the right prefix boundary

    // Case 1. Find the largest possible prefix that firstIpAddress can be.
    //    Say we have an input range of 192.168.0.0 to 192.184.0.0.
    //    The number of IP addresses in the range is 1048576 = 2^20, so we
    //    would need a /12 (32-20) to cover that many IP addresses but the
    //    largest valid prefix that can start from 192.168.0.0 is /13.
    //    The all-zero address has no set bit and is aligned to every block
    //    size, so it places no limit.
    const int64_t lowestSetBit = getLowestSetBit(firstIpAddress);
    const int64_t firstAddressMaxBits =
        lowestSetBit < 0 ? kAddressBits : lowestSetBit;

    // Case 2. Find the largest prefix length to cover N IP addresses.
    //    The number of IP addresses within a valid prefix must be a power of 2
    //    but the IP count in our IP ranges may not be a power of 2. We take
    //    the highest power of 2 that does not overrun ipCount, which is
    //    floor(log2(ipCount)) = bitLength(ipCount) - 1.
    const int64_t ipRangeMaxBits =
        ipCount == 0 ? kAddressBits : bitLength(ipCount) - 1;

    return std::min(firstAddressMaxBits, ipRangeMaxBits);
  }

  // Splits the inclusive range [firstIpAddress, lastIpAddress] into the
  // minimal sequence of CIDR blocks, returned in ascending address order as
  // (address, prefix length) tuples. 'ipVersionMaxBits' is the address width
  // of the IP version (passed by the caller as kIPV4Bits or kIPV6Bits).
  FOLLY_ALWAYS_INLINE static std::vector<std::tuple<int128_t, int8_t>>
  generateMinIpPrefixes(
      int128_t firstIpAddress,
      int128_t lastIpAddress,
      uint32_t ipVersionMaxBits) {
    std::vector<std::tuple<int128_t, int8_t>> ipPrefixSlices;
    // i.e., while firstIpAddress <= lastIpAddress
    while (IPADDRESS()->compare(firstIpAddress, lastIpAddress) <= 0) {
      // find the number of bits for the next prefix in the range
      const int64_t rangeBits = findRangeBits(firstIpAddress, lastIpAddress);

      const auto prefixLength = ipVersionMaxBits - rangeBits;
      ipPrefixSlices.emplace_back(firstIpAddress, prefixLength);

      // Offset of the last address of this block from its first address,
      // i.e. 2^rangeBits - 1. A 128-bit block cannot be expressed as a shift.
      const uint128_t blockSpan = rangeBits >= 128
          ? ~static_cast<uint128_t>(0)
          : (static_cast<uint128_t>(1) << rangeBits) - 1;
      const auto blockLast = static_cast<int128_t>(
          static_cast<uint128_t>(firstIpAddress) + blockSpan);

      // Stop as soon as the range is fully covered. Advancing past the
      // highest address would wrap around to zero and re-enter the loop.
      if (blockLast == lastIpAddress) {
        break;
      }
      firstIpAddress =
          static_cast<int128_t>(static_cast<uint128_t>(blockLast) + 1);
    }
    return ipPrefixSlices;
  }

  // Merges the sorted, non-empty list of prefixes into disjoint,
  // non-adjacent inclusive ranges [first, last], in ascending order.
  FOLLY_ALWAYS_INLINE static std::vector<std::pair<int128_t, int128_t>>
  mergeIpRanges(const std::vector<std::tuple<int128_t, int8_t>>& prefixes) {
    std::vector<std::pair<int128_t, int128_t>> mergedRanges;
    mergedRanges.reserve(prefixes.size());

    int128_t firstIpAddress = std::get<0>(prefixes.front());
    int128_t lastIpAddress = getIPSubnetMax(
        std::get<0>(prefixes.front()), std::get<1>(prefixes.front()));

    /*
      There are four cases to cover for two IP ranges where
      range1.startIp <= range2.startIp:

      1. Equal / duplicates.
           [-------]
           [-------]
         The second one is ignored.

      2. Second is a subnet of (contained within) the first.
           [-------]   OR   [-------]   OR   [-------]
             [---]          [----]              [----]
         The second one is ignored.

      3. Second is adjacent to / contiguous with the first.
           [-------]
                    [-------]
         The current range is extended to the last IP address of the second.

      4. Second is disjoint from the first.
           [-------]
                       [-------]
         The first range is finalized and the second becomes the current one.
    */

    for (size_t i = 1; i < prefixes.size(); i++) {
      const int128_t nextFirstIpAddress = std::get<0>(prefixes[i]);
      const int128_t nextLastIpAddress =
          getIPSubnetMax(std::get<0>(prefixes[i]), std::get<1>(prefixes[i]));

      // If nextFirstIpAddress <= lastIpAddress then there is overlap.
      // However, based on the properties of the input sorted array, this will
      // always mean that the next* range is a subnet of [firstIpAddress,
      // lastIpAddress]. We just ignore these prefixes since they are already
      // covered (case 1 and case 2).
      //
      // i.e. nextFirstIpAddress > lastIpAddress -- the next range does not
      // overlap the first
      if (IPADDRESS()->compare(lastIpAddress, nextFirstIpAddress) < 0) {
        // Address right after the current range. Added in unsigned
        // arithmetic so that crossing the signed boundary does not overflow.
        const auto successor = static_cast<int128_t>(
            static_cast<uint128_t>(lastIpAddress) + 1);

        // If they are not contiguous (case 4), finalize the range.
        // Otherwise, extend the current range (case 3).
        if (IPADDRESS()->compare(successor, nextFirstIpAddress) != 0) {
          mergedRanges.emplace_back(firstIpAddress, lastIpAddress);
          firstIpAddress = nextFirstIpAddress;
        }
        lastIpAddress = nextLastIpAddress;
      }
    }

    mergedRanges.emplace_back(firstIpAddress, lastIpAddress);
    return mergedRanges;
  }
};

void registerIPAddressFunctions(const std::string& prefix) {
registerIPAddressType();
registerIPPrefixType();
Expand All @@ -194,6 +416,8 @@ void registerIPAddressFunctions(const std::string& prefix) {
{prefix + "is_subnet_of"});
registerFunction<IPSubnetOfFunction, bool, IPPrefix, IPPrefix>(
{prefix + "is_subnet_of"});
registerFunction<IPPrefixCollapseFunction, Array<IPPrefix>, Array<IPPrefix>>(
{prefix + "ip_prefix_collapse"});
}

} // namespace facebook::velox::functions
Loading

0 comments on commit 3e8a924

Please sign in to comment.