Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 5001b0a

Browse files
committed Nov 12, 2023
Add cista::*::cstring type
This new type always stores a trailing \0 character without sacrificing a byte of inline storage when the string is a small string. NUL characters inside the data are supported as well. Inspired by felixguendling#187 (comment); see felixguendling#187.
1 parent f1a9c46 commit 5001b0a

File tree

5 files changed

+579
-0
lines changed

5 files changed

+579
-0
lines changed
 

‎include/cista/containers.h

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "cista/containers/array.h"
44
#include "cista/containers/bitset.h"
55
#include "cista/containers/bitvec.h"
6+
#include "cista/containers/cstring.h"
67
#include "cista/containers/fws_multimap.h"
78
#include "cista/containers/hash_map.h"
89
#include "cista/containers/hash_set.h"

‎include/cista/containers/cstring.h

+458
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,458 @@
1+
#pragma once
2+
3+
#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdlib>
#include <cstring>

#include <algorithm>
#include <new>
#include <ostream>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>

#include "cista/containers/ptr.h"
#include "cista/type_traits.h"
13+
14+
namespace cista {
15+
16+
// A string container that keeps a '\0' byte directly behind the last byte of
// its payload, so the pointer returned by data() can be passed to any API
// that expects a C string.
//
// The payload is binary safe: any number of '\0' bytes may appear within
// [data(), data() + size()).
//
// Three storage modes exist:
//  - inline ("short"): up to short_length_limit bytes live inside the object.
//    A completely filled inline buffer is terminated by stack::remaining_,
//    which is 0 in that case.
//  - owning heap: a std::malloc'ed buffer of size() + 1 bytes that is released
//    with std::free() in reset().
//  - non-owning heap: a borrowed pointer. Nothing is copied, so the caller has
//    to keep the buffer alive and has to make sure data()[size()] is '\0'.
template <typename Ptr = char const*>
struct generic_cstring {
  using msize_t = std::uint32_t;
  using value_type = char;

  // Largest payload that still fits into the inline buffer.
  static constexpr msize_t short_length_limit = 15U;

  // strlen() narrowed to the size type used by this container.
  static msize_t mstrlen(char const* s) noexcept {
    return static_cast<msize_t>(std::strlen(s));
  }

  // Null-tolerant char const* -> string_view conversion; a null pointer is
  // treated as the empty string (the constructors do the same).
  static std::string_view as_view(char const* s) noexcept {
    return s == nullptr ? std::string_view{} : std::string_view{s};
  }

  // Tag types selecting whether a constructor copies or borrows its input.
  static constexpr struct owning_t {
  } owning{};
  static constexpr struct non_owning_t {
  } non_owning{};

  constexpr generic_cstring() noexcept {}
  ~generic_cstring() noexcept { reset(); }

  // The destructor frees owned memory, hence copies must not be shallow.
  generic_cstring(generic_cstring const& o) { copy_from(o); }
  generic_cstring(generic_cstring&& o) noexcept { move_from(std::move(o)); }
  generic_cstring& operator=(generic_cstring const& o) {
    copy_from(o);
    return *this;
  }
  generic_cstring& operator=(generic_cstring&& o) noexcept {
    move_from(std::move(o));
    return *this;
  }

  generic_cstring(std::string_view s, owning_t const) { set_owning(s); }
  generic_cstring(std::string_view s, non_owning_t const) { set_non_owning(s); }
  generic_cstring(std::string const& s, owning_t const) { set_owning(s); }
  generic_cstring(std::string const& s, non_owning_t const) {
    set_non_owning(s);
  }
  generic_cstring(char const* s, owning_t const) {
    set_owning(s, s != nullptr ? mstrlen(s) : 0);
  }
  generic_cstring(char const* s, non_owning_t const) { set_non_owning(s); }

  char* begin() noexcept { return data(); }
  char* end() noexcept { return data() + size(); }
  char const* begin() const noexcept { return data(); }
  char const* end() const noexcept { return data() + size(); }

  friend char const* begin(generic_cstring const& s) { return s.begin(); }
  friend char* begin(generic_cstring& s) { return s.begin(); }
  friend char const* end(generic_cstring const& s) { return s.end(); }
  friend char* end(generic_cstring& s) { return s.end(); }

  // True if the payload lives in the inline buffer. heap::minus_one_ shares
  // its position with stack::remaining_ and is always negative.
  bool is_short() const noexcept { return s_.remaining_ >= 0; }

  // Inline strings are always owned; heap strings only if allocated here.
  bool is_owning() const { return is_short() || h_.self_allocated_; }

  // Releases owned heap memory (if any) and turns *this into the empty
  // inline string.
  void reset() noexcept {
    if (!is_short() && h_.self_allocated_) {
      std::free(data());
    }
    s_ = stack{};
  }

  void set_owning(std::string const& s) {
    set_owning(s.data(), static_cast<msize_t>(s.size()));
  }

  void set_owning(std::string_view s) {
    set_owning(s.data(), static_cast<msize_t>(s.size()));
  }

  void set_owning(char const* str) {
    set_owning(str, str != nullptr ? mstrlen(str) : 0);
  }

  // Stores a private copy of [str, str + len). A null pointer or a length of
  // zero yields the empty string.
  //
  // The new representation is built completely before the old one is
  // released. This keeps the call valid when str points into this string's
  // own buffer and leaves *this untouched if the allocation throws.
  void set_owning(char const* str, msize_t const len) {
    if (str == nullptr || len == 0U) {
      reset();
      return;
    }
    if (len <= short_length_limit) {
      stack fresh{};  // zero filled: unused tail bytes act as terminator
      std::memcpy(fresh.s_, str, len);
      fresh.remaining_ = static_cast<std::int8_t>(short_length_limit - len);
      reset();
      s_ = fresh;
    } else {
      heap fresh{len, owning};  // terminator already written by the ctor
      std::memcpy(const_cast<char*>(to_raw_ptr(fresh.ptr_)), str, len);
      reset();
      h_ = fresh;
    }
  }

  void set_non_owning(std::string const& v) {
    set_non_owning(v.data(), static_cast<msize_t>(v.size()));
  }

  // NOTE: a std::string_view is not necessarily followed by a '\0' byte; the
  // caller has to guarantee that v.data()[v.size()] is readable and '\0'.
  void set_non_owning(std::string_view v) {
    set_non_owning(v.data(), static_cast<msize_t>(v.size()));
  }

  void set_non_owning(char const* str) {
    set_non_owning(str, str != nullptr ? mstrlen(str) : 0);
  }

  // Borrows [str, str + len) without copying. A null pointer yields the empty
  // inline string so that data() never returns nullptr.
  void set_non_owning(char const* str, msize_t const len) {
    reset();
    if (str == nullptr) {
      return;
    }
    h_ = heap(str, len, non_owning);
  }

  // Takes over the representation of s and leaves s as the empty string.
  // Storage previously held by *this is released first.
  void move_from(generic_cstring&& s) noexcept {
    if (&s == this) {
      return;
    }
    reset();
    std::memcpy(static_cast<void*>(this), &s, sizeof(*this));
    if constexpr (!std::is_pointer_v<Ptr>) {
      if (!is_short()) {
        // A byte-wise copy is not sufficient for pointer wrapper types, so
        // the pointer is assigned properly from the source.
        h_.ptr_ = s.h_.ptr_;
      }
    }
    // Zero filling the source would decode as a short string of size
    // short_length_limit; stack{} is the real empty state.
    s.s_ = stack{};
  }

  // Makes *this a copy of s: inline and owning strings are duplicated,
  // non-owning strings share the borrowed buffer.
  void copy_from(generic_cstring const& s) {
    if (&s == this) {
      return;
    }
    if (s.is_short()) {
      reset();
      std::memcpy(static_cast<void*>(this), &s, sizeof(s));
    } else if (s.h_.self_allocated_) {
      set_owning(s.data(), s.size());
    } else {
      set_non_owning(s.data(), s.size());
    }
  }

  bool empty() const noexcept { return size() == 0U; }
  std::string_view view() const noexcept { return {data(), size()}; }
  std::string str() const { return {data(), size()}; }

  operator std::string_view() const { return view(); }

  char& operator[](std::size_t const i) noexcept { return data()[i]; }
  char const& operator[](std::size_t const i) const noexcept {
    return data()[i];
  }

  friend std::ostream& operator<<(std::ostream& out, generic_cstring const& s) {
    return out << s.view();
  }

  // --- generic_cstring <=> generic_cstring ---
  friend bool operator==(generic_cstring const& a,
                         generic_cstring const& b) noexcept {
    return a.view() == b.view();
  }
  friend bool operator!=(generic_cstring const& a,
                         generic_cstring const& b) noexcept {
    return a.view() != b.view();
  }
  friend bool operator<(generic_cstring const& a,
                        generic_cstring const& b) noexcept {
    return a.view() < b.view();
  }
  friend bool operator>(generic_cstring const& a,
                        generic_cstring const& b) noexcept {
    return a.view() > b.view();
  }
  friend bool operator<=(generic_cstring const& a,
                         generic_cstring const& b) noexcept {
    return a.view() <= b.view();
  }
  friend bool operator>=(generic_cstring const& a,
                         generic_cstring const& b) noexcept {
    return a.view() >= b.view();
  }

  // --- generic_cstring <=> std::string_view ---
  friend bool operator==(generic_cstring const& a,
                         std::string_view b) noexcept {
    return a.view() == b;
  }
  friend bool operator!=(generic_cstring const& a,
                         std::string_view b) noexcept {
    return a.view() != b;
  }
  friend bool operator<(generic_cstring const& a, std::string_view b) noexcept {
    return a.view() < b;
  }
  friend bool operator>(generic_cstring const& a, std::string_view b) noexcept {
    return a.view() > b;
  }
  friend bool operator<=(generic_cstring const& a,
                         std::string_view b) noexcept {
    return a.view() <= b;
  }
  friend bool operator>=(generic_cstring const& a,
                         std::string_view b) noexcept {
    return a.view() >= b;
  }

  // --- std::string_view <=> generic_cstring ---
  friend bool operator==(std::string_view a,
                         generic_cstring const& b) noexcept {
    return a == b.view();
  }
  friend bool operator!=(std::string_view a,
                         generic_cstring const& b) noexcept {
    return a != b.view();
  }
  friend bool operator<(std::string_view a, generic_cstring const& b) noexcept {
    return a < b.view();
  }
  friend bool operator>(std::string_view a, generic_cstring const& b) noexcept {
    return a > b.view();
  }
  friend bool operator<=(std::string_view a,
                         generic_cstring const& b) noexcept {
    return a <= b.view();
  }
  friend bool operator>=(std::string_view a,
                         generic_cstring const& b) noexcept {
    return a >= b.view();
  }

  // --- generic_cstring <=> char const* (nullptr compares like "") ---
  friend bool operator==(generic_cstring const& a, char const* b) noexcept {
    return a.view() == as_view(b);
  }
  friend bool operator!=(generic_cstring const& a, char const* b) noexcept {
    return a.view() != as_view(b);
  }
  friend bool operator<(generic_cstring const& a, char const* b) noexcept {
    return a.view() < as_view(b);
  }
  friend bool operator>(generic_cstring const& a, char const* b) noexcept {
    return a.view() > as_view(b);
  }
  friend bool operator<=(generic_cstring const& a, char const* b) noexcept {
    return a.view() <= as_view(b);
  }
  friend bool operator>=(generic_cstring const& a, char const* b) noexcept {
    return a.view() >= as_view(b);
  }

  // --- char const* <=> generic_cstring (nullptr compares like "") ---
  friend bool operator==(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) == b.view();
  }
  friend bool operator!=(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) != b.view();
  }
  friend bool operator<(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) < b.view();
  }
  friend bool operator>(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) > b.view();
  }
  friend bool operator<=(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) <= b.view();
  }
  friend bool operator>=(char const* a, generic_cstring const& b) noexcept {
    return as_view(a) >= b.view();
  }

  // Unwraps Ptr (either a plain pointer or a wrapper offering get()).
  static char const* to_raw_ptr(Ptr const& p) noexcept {
    if constexpr (std::is_pointer_v<Ptr>) {
      return p;
    } else {
      return p.get();
    }
  }

  // Start of the payload, independent of the storage mode.
  char const* internal_data() const noexcept {
    return is_short() ? s_.s_ : to_raw_ptr(h_.ptr_);
  }

  char* data() noexcept { return const_cast<char*>(internal_data()); }
  char const* data() const noexcept { return internal_data(); }

  msize_t size() const noexcept { return is_short() ? s_.size() : h_.size(); }

  // Representation of strings that do not live in the inline buffer.
  struct heap {
    Ptr ptr_{nullptr};
    std::uint32_t size_{0};
    bool self_allocated_{false};
    // Padding derived from sizeof(Ptr) rather than the native pointer width,
    // so that an 8 byte Ptr on a 32 bit target still places minus_one_ in the
    // last byte of the object.
    char __fill__[sizeof(Ptr) == 8 ? 2 : 6]{0};
    std::int8_t minus_one_{-1};  // The offset of this field needs to match
                                 // the offset of stack::remaining_ below.

    heap() = default;

    // Allocates len + 1 bytes and writes the terminator. The payload bytes
    // are left uninitialized for the caller to fill in.
    // Throws std::bad_alloc if the allocation fails.
    heap(msize_t len, owning_t) {
      auto* const mem =
          static_cast<char*>(std::malloc(static_cast<std::size_t>(len) + 1U));
      if (mem == nullptr) {
        throw std::bad_alloc{};
      }
      mem[len] = '\0';
      ptr_ = mem;
      size_ = len;
      self_allocated_ = true;
    }

    // Borrows an existing buffer; nothing is copied or allocated.
    heap(Ptr ptr, msize_t len, non_owning_t) {
      ptr_ = ptr;
      size_ = len;
    }

    msize_t size() const { return size_; }
  };

  // Representation of strings stored inside the object itself.
  struct stack {
    char s_[short_length_limit]{0};
    std::int8_t remaining_{
        short_length_limit};  // The remaining capacity the inline buffer still
                              // has. A negative value indicates the buffer is
                              // not inline. In case the inline buffer is fully
                              // occupied, this field also serves as a null
                              // terminator.

    msize_t size() const {
      assert(remaining_ >= 0);
      return short_length_limit - static_cast<msize_t>(remaining_);
    }
  };

  static_assert(sizeof(heap) == sizeof(stack),
                "heap and stack representation must have the same size so "
                "that heap::minus_one_ overlays stack::remaining_");

  union {
    heap h_;
    stack s_{};
  };
};
345+
346+
template <typename Ptr>
347+
struct basic_cstring : public generic_cstring<Ptr> {
348+
using base = generic_cstring<Ptr>;
349+
350+
using base::base;
351+
using base::operator std::string_view;
352+
353+
friend std::ostream& operator<<(std::ostream& out, basic_cstring const& s) {
354+
return out << s.view();
355+
}
356+
357+
explicit operator std::string() const { return {base::data(), base::size()}; }
358+
359+
basic_cstring(std::string_view s) : base{s, base::owning} {}
360+
basic_cstring(std::string const& s) : base{s, base::owning} {}
361+
basic_cstring(char const* s) : base{s, base::owning} {}
362+
basic_cstring(char const* s, typename base::msize_t const len)
363+
: base{s, len, base::owning} {}
364+
365+
basic_cstring(basic_cstring const& o) : base{o.view(), base::owning} {}
366+
basic_cstring(basic_cstring&& o) { base::move_from(std::move(o)); }
367+
368+
basic_cstring& operator=(basic_cstring const& o) {
369+
base::set_owning(o.data(), o.size());
370+
return *this;
371+
}
372+
373+
basic_cstring& operator=(basic_cstring&& o) {
374+
base::move_from(std::move(o));
375+
return *this;
376+
}
377+
378+
basic_cstring& operator=(char const* s) {
379+
base::set_owning(s);
380+
return *this;
381+
}
382+
basic_cstring& operator=(std::string const& s) {
383+
base::set_owning(s);
384+
return *this;
385+
}
386+
basic_cstring& operator=(std::string_view s) {
387+
base::set_owning(s);
388+
return *this;
389+
}
390+
};
391+
392+
template <typename Ptr>
393+
struct basic_cstring_view : public generic_cstring<Ptr> {
394+
using base = generic_cstring<Ptr>;
395+
396+
using base::base;
397+
using base::operator std::string_view;
398+
399+
friend std::ostream& operator<<(std::ostream& out,
400+
basic_cstring_view const& s) {
401+
return out << s.view();
402+
}
403+
404+
basic_cstring_view(std::string_view s) : base{s, base::non_owning} {}
405+
basic_cstring_view(std::string const& s) : base{s, base::non_owning} {}
406+
basic_cstring_view(char const* s) : base{s, base::non_owning} {}
407+
basic_cstring_view(char const* s, typename base::msize_t const len)
408+
: base{s, len, base::non_owning} {}
409+
410+
basic_cstring_view(basic_cstring_view const& o) {
411+
base::set_non_owning(o.data(), o.size());
412+
}
413+
basic_cstring_view(basic_cstring_view&& o) {
414+
base::set_non_owning(o.data(), o.size());
415+
}
416+
basic_cstring_view& operator=(basic_cstring_view const& o) {
417+
base::set_non_owning(o.data(), o.size());
418+
return *this;
419+
}
420+
basic_cstring_view& operator=(basic_cstring_view&& o) {
421+
base::set_non_owning(o.data(), o.size());
422+
return *this;
423+
}
424+
425+
basic_cstring_view& operator=(char const* s) {
426+
base::set_non_owning(s);
427+
return *this;
428+
}
429+
basic_cstring_view& operator=(std::string_view s) {
430+
base::set_non_owning(s);
431+
return *this;
432+
}
433+
basic_cstring_view& operator=(std::string const& s) {
434+
base::set_non_owning(s);
435+
return *this;
436+
}
437+
};
438+
439+
// Specialize is_string_helper to std::true_type for all three cstring
// flavours. The primary template is not declared in this file (presumably it
// comes from cista/type_traits.h -- verify).
template <typename Ptr>
440+
struct is_string_helper<generic_cstring<Ptr>> : std::true_type {};
441+
442+
template <typename Ptr>
443+
struct is_string_helper<basic_cstring<Ptr>> : std::true_type {};
444+
445+
template <typename Ptr>
446+
struct is_string_helper<basic_cstring_view<Ptr>> : std::true_type {};
447+
448+
namespace raw {
449+
using generic_cstring = generic_cstring<offset::ptr<char const>>;
450+
using cstring = basic_cstring<offset::ptr<char const>>;
451+
} // namespace raw
452+
453+
namespace offset {
454+
using generic_cstring = generic_cstring<offset::ptr<char const>>;
455+
using cstring = basic_cstring<offset::ptr<char const>>;
456+
} // namespace offset
457+
458+
} // namespace cista

‎include/cista/hashing.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <tuple>
99
#include <type_traits>
1010

11+
#include "cista/containers/cstring.h"
1112
#include "cista/containers/offset_ptr.h"
1213
#include "cista/containers/pair.h"
1314
#include "cista/containers/string.h"

‎include/cista/serialization.h

+34
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,35 @@ void serialize(Ctx& c,
260260
pos + cista_member_offset(Type, element_count_));
261261
}
262262

263+
template <typename Ctx, typename Ptr>
264+
void serialize(Ctx& c, generic_cstring<Ptr> const* origin, offset_t const pos) {
265+
using Type = generic_cstring<Ptr>;
266+
267+
if (origin->is_short()) {
268+
return;
269+
}
270+
271+
const auto* data = origin->data();
272+
auto size = origin->size();
273+
std::string buf;
274+
if (!origin->is_owning()) {
275+
buf = origin->str();
276+
data = buf.data();
277+
size = buf.size();
278+
}
279+
auto capacity = size + 1;
280+
281+
auto const start = c.write(data, capacity);
282+
c.write(pos + cista_member_offset(Type, h_.ptr_),
283+
convert_endian<Ctx::MODE>(start - cista_member_offset(Type, h_.ptr_) -
284+
pos));
285+
c.write(pos + cista_member_offset(Type, h_.size_),
286+
convert_endian<Ctx::MODE>(origin->h_.size_));
287+
c.write(pos + cista_member_offset(Type, h_.self_allocated_), false);
288+
c.write(pos + cista_member_offset(Type, h_.minus_one_),
289+
static_cast<char>(-1));
290+
}
291+
263292
template <typename Ctx, typename Ptr>
264293
void serialize(Ctx& c, basic_string<Ptr> const* origin, offset_t const pos) {
265294
serialize(c, static_cast<generic_string<Ptr> const*>(origin), pos);
@@ -271,6 +300,11 @@ void serialize(Ctx& c, basic_string_view<Ptr> const* origin,
271300
serialize(c, static_cast<generic_string<Ptr> const*>(origin), pos);
272301
}
273302

303+
template <typename Ctx, typename Ptr>
304+
void serialize(Ctx& c, basic_cstring<Ptr> const* origin, offset_t const pos) {
305+
serialize(c, static_cast<generic_cstring<Ptr> const*>(origin), pos);
306+
}
307+
274308
template <typename Ctx, typename T, typename Ptr>
275309
void serialize(Ctx& c, basic_unique_ptr<T, Ptr> const* origin,
276310
offset_t const pos) {

‎test/cstring_test.cc

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#include <iostream>
2+
3+
#include "doctest.h"
4+
5+
#ifdef SINGLE_HEADER
6+
#include "cista.h"
7+
#else
8+
#include "cista/containers/cstring.h"
9+
#include "cista/hash.h"
10+
#endif
11+
12+
using cista::raw::cstring;

// Fixtures around the inline capacity of 15 characters, plus one clearly
// short and one clearly long string.
constexpr char const* const CORNER_CASE_SHORT_14 = "01234567891234";
constexpr char const* const CORNER_CASE_SHORT_15 = "012345678912345";
constexpr char const* const CORNER_CASE_LONG_16 = "0123456789123456";
constexpr char const* const LONG_STR = "hello world hello world";
constexpr char const* const SHORT_STR = "hello world";
19+
20+
// A default constructed cstring is the empty inline string with valid data().
TEST_CASE("cstring init") {
  cstring s{};
  CHECK(s.is_short());
  CHECK(s.size() == 0);
  CHECK(s.data() != nullptr);
}
26+
27+
// 14 characters: one below the inline capacity.
TEST_CASE("cstring long short corner 14") {
  cstring s{CORNER_CASE_SHORT_14, cstring::owning};
  auto const expected_size = std::strlen(CORNER_CASE_SHORT_14);
  CHECK(s.is_short());
  CHECK(s.size() == expected_size);
  CHECK(s.view() == CORNER_CASE_SHORT_14);
}
33+
34+
// 15 characters: exactly the inline capacity.
TEST_CASE("cstring long short corner 15") {
  cstring s{CORNER_CASE_SHORT_15, cstring::owning};
  auto const expected_size = std::strlen(CORNER_CASE_SHORT_15);
  CHECK(s.is_short());
  CHECK(s.size() == expected_size);
  CHECK(s.view() == CORNER_CASE_SHORT_15);
}
40+
41+
// 16 characters: the first size that no longer fits inline.
TEST_CASE("cstring long short corner 16") {
  cstring s{CORNER_CASE_LONG_16, cstring::owning};
  auto const expected_size = std::strlen(CORNER_CASE_LONG_16);
  CHECK(!s.is_short());
  CHECK(s.size() == expected_size);
  CHECK(s.view() == CORNER_CASE_LONG_16);
}
47+
48+
// Re-assigning an inline string with progressively longer inputs switches it
// to heap storage and keeps the contents intact.
TEST_CASE("cstring long short") {
  cstring s{SHORT_STR, cstring::owning};
  CHECK(s.view() == SHORT_STR);
  CHECK(s.is_short());

  // short -> long (smallest heap string)
  s.set_owning(CORNER_CASE_LONG_16);
  CHECK(!s.is_short());
  CHECK(s.view() == CORNER_CASE_LONG_16);

  // long -> longer
  s.set_owning(LONG_STR);
  CHECK(!s.is_short());
  CHECK(s.view() == LONG_STR);
}
61+
62+
// Starts with a heap allocated string so that the assignments below really
// have a buffer to release (the previous version only ever held a short
// string and therefore never exercised the deallocation path).
TEST_CASE("cstring dealloc long to short") {
  cstring s = LONG_STR;
  CHECK(!s.is_short());
  CHECK(s.size() == std::strlen(LONG_STR));

  // long -> short: releases the heap buffer and switches to inline storage.
  s = "one two";
  CHECK(s.size() == std::strlen("one two"));
  CHECK(s.is_short());

  s.set_non_owning("");
  CHECK(s.empty());
}
68+
69+
// Copy construction and copy assignment both yield an equal string.
TEST_CASE("cstring copy assign and copy construct") {
  cstring s0{LONG_STR, cstring::owning};

  // copy construct
  cstring s1{s0};
  CHECK(s0 == s1);
  CHECK(s1.view() == LONG_STR);

  // copy assign
  cstring s2;
  s2 = s0;
  CHECK(s0 == s2);
  CHECK(s2.view() == LONG_STR);
}
80+
81+
// Hashing the empty string has to return the seed unchanged.
TEST_CASE("cstring hash") {
  cstring str{""};
  auto const h = cista::hash(str, cista::BASE_HASH);
  CHECK(cista::BASE_HASH == h);
}

0 commit comments

Comments
 (0)
Please sign in to comment.