Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===--- SourceLocationEncoding.h - Small serialized locations --*- C++ -*-===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | // | ||
| 9 | // Source locations are stored pervasively in the AST, making up a third of | ||
| 10 | // the size of typical serialized files. Storing them efficiently is important. | ||
| 11 | // | ||
| 12 | // We use integers optimized by VBR-encoding, because: | ||
| 13 | //  - when abbreviations cannot be used, VBR6 encoding is our only choice | ||
| 14 | //  - in the worst case a SourceLocation can be ~any 32-bit number, but in | ||
| 15 | //    practice they are highly predictable | ||
| 16 | // | ||
| 17 | // We encode the integer so that likely values encode as small numbers that | ||
| 18 | // turn into few VBR chunks: | ||
| 19 | //  - the invalid sentinel location is a very common value: it encodes as 0 | ||
| 20 | //  - the "macro or not" bit is stored at the bottom of the integer | ||
| 21 | //    (rather than at the top, as in memory), so macro locations can have | ||
| 22 | //    small representations. | ||
| 23 | //  - related locations (e.g. of a left and right paren pair) are usually | ||
| 24 | //    similar, so when encoding a sequence of locations we store only | ||
| 25 | //    differences between successive elements. | ||
| 26 | // | ||
| 27 | //===----------------------------------------------------------------------===// | ||
| 28 | |||
| 29 | #include <climits> | ||
| 30 | #include "clang/Basic/SourceLocation.h" | ||
| 31 | |||
| 32 | #ifndef LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H | ||
| 33 | #define LLVM_CLANG_SERIALIZATION_SOURCELOCATIONENCODING_H | ||
| 34 | |||
| 35 | namespace clang { | ||
| 36 | class SourceLocationSequence; | ||
| 37 | |||
| 38 | /// Serialized encoding of SourceLocations without context. | ||
| 39 | /// Optimized to have small unsigned values (=> small after VBR encoding). | ||
| 40 | /// | ||
| 41 | // Macro locations have the top bit set, we rotate by one so it is the low bit. | ||
| 42 | class SourceLocationEncoding { | ||
| 43 | using UIntTy = SourceLocation::UIntTy; | ||
| 44 | constexpr static unsigned UIntBits = CHAR_BIT * sizeof(UIntTy); | ||
| 45 | |||
| 46 | static UIntTy encodeRaw(UIntTy Raw) { | ||
| 47 | return (Raw << 1) | (Raw >> (UIntBits - 1)); | ||
| 48 |   } | ||
| 49 | static UIntTy decodeRaw(UIntTy Raw) { | ||
| 50 | return (Raw >> 1) | (Raw << (UIntBits - 1)); | ||
| 51 |   } | ||
| 52 | friend SourceLocationSequence; | ||
| 53 | |||
| 54 | public: | ||
| 55 | static uint64_t encode(SourceLocation Loc, | ||
| 56 | SourceLocationSequence * = nullptr); | ||
| 57 | static SourceLocation decode(uint64_t, SourceLocationSequence * = nullptr); | ||
| 58 | }; | ||
| 59 | |||
| 60 | /// Serialized encoding of a sequence of SourceLocations. | ||
| 61 | /// | ||
| 62 | /// Optimized to produce small values when locations with the sequence are | ||
| 63 | /// similar. Each element can be delta-encoded against the last nonzero element. | ||
| 64 | /// | ||
| 65 | /// Sequences should be started by creating a SourceLocationSequence::State, | ||
| 66 | /// and then passed around as SourceLocationSequence*. Example: | ||
| 67 | /// | ||
| 68 | ///   // establishes a sequence | ||
| 69 | ///   void EmitTopLevelThing() { | ||
| 70 | ///     SourceLocationSequence::State Seq; | ||
| 71 | ///     EmitContainedThing(Seq); | ||
| 72 | ///     EmitRecursiveThing(Seq); | ||
| 73 | ///   } | ||
| 74 | /// | ||
| 75 | ///   // optionally part of a sequence | ||
| 76 | ///   void EmitContainedThing(SourceLocationSequence *Seq = nullptr) { | ||
| 77 | ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq)); | ||
| 78 | ///   } | ||
| 79 | /// | ||
| 80 | ///   // establishes a sequence if there isn't one already | ||
| 81 | ///   void EmitRecursiveThing(SourceLocationSequence *ParentSeq = nullptr) { | ||
| 82 | ///     SourceLocationSequence::State Seq(ParentSeq); | ||
| 83 | ///     Record.push_back(SourceLocationEncoding::encode(SomeLoc, Seq)); | ||
| 84 | ///     EmitRecursiveThing(Seq); | ||
| 85 | ///   } | ||
| 86 | /// | ||
| 87 | class SourceLocationSequence { | ||
| 88 | using UIntTy = SourceLocation::UIntTy; | ||
| 89 | using EncodedTy = uint64_t; | ||
| 90 | constexpr static auto UIntBits = SourceLocationEncoding::UIntBits; | ||
| 91 | static_assert(sizeof(EncodedTy) > sizeof(UIntTy), "Need one extra bit!"); | ||
| 92 | |||
| 93 |   // Prev stores the rotated last nonzero location. | ||
| 94 | UIntTy &Prev; | ||
| 95 | |||
| 96 |   // Zig-zag encoding turns small signed integers into small unsigned integers. | ||
| 97 |   // 0 => 0, -1 => 1, 1 => 2, -2 => 3, ... | ||
| 98 | static UIntTy zigZag(UIntTy V) { | ||
| 99 | UIntTy Sign = (V & (1 << (UIntBits - 1))) ? UIntTy(-1) : UIntTy(0); | ||
| 100 | return Sign ^ (V << 1); | ||
| 101 |   } | ||
| 102 | static UIntTy zagZig(UIntTy V) { return (V >> 1) ^ -(V & 1); } | ||
| 103 | |||
| 104 | SourceLocationSequence(UIntTy &Prev) : Prev(Prev) {} | ||
| 105 | |||
| 106 | EncodedTy encodeRaw(UIntTy Raw) { | ||
| 107 | if (Raw == 0) | ||
| 108 | return 0; | ||
| 109 | UIntTy Rotated = SourceLocationEncoding::encodeRaw(Raw); | ||
| 110 | if (Prev == 0) | ||
| 111 | return Prev = Rotated; | ||
| 112 | UIntTy Delta = Rotated - Prev; | ||
| 113 | Prev = Rotated; | ||
| 114 |     // Exactly one 33 bit value is possible! (1 << 32). | ||
| 115 |     // This is because we have two representations of zero: trivial & relative. | ||
| 116 | return 1 + EncodedTy{zigZag(Delta)}; | ||
| 117 |   } | ||
| 118 | UIntTy decodeRaw(EncodedTy Encoded) { | ||
| 119 | if (Encoded == 0) | ||
| 120 | return 0; | ||
| 121 | if (Prev == 0) | ||
| 122 | return SourceLocationEncoding::decodeRaw(Prev = Encoded); | ||
| 123 | return SourceLocationEncoding::decodeRaw(Prev += zagZig(Encoded - 1)); | ||
| 124 |   } | ||
| 125 | |||
| 126 | public: | ||
| 127 | SourceLocation decode(EncodedTy Encoded) { | ||
| 128 | return SourceLocation::getFromRawEncoding(decodeRaw(Encoded)); | ||
| 129 |   } | ||
| 130 | EncodedTy encode(SourceLocation Loc) { | ||
| 131 | return encodeRaw(Loc.getRawEncoding()); | ||
| 132 |   } | ||
| 133 | |||
| 134 | class State; | ||
| 135 | }; | ||
| 136 | |||
| 137 | /// This object establishes a SourceLocationSequence. | ||
| 138 | class SourceLocationSequence::State { | ||
| 139 | UIntTy Prev = 0; | ||
| 140 |   SourceLocationSequence Seq; | ||
| 141 | |||
| 142 | public: | ||
| 143 |   // If Parent is provided and non-null, then this root becomes part of that | ||
| 144 |   // enclosing sequence instead of establishing a new one. | ||
| 145 | State(SourceLocationSequence *Parent = nullptr) | ||
| 146 | : Seq(Parent ? Parent->Prev : Prev) {} | ||
| 147 | |||
| 148 |   // Implicit conversion for uniform use of roots vs propagated sequences. | ||
| 149 | operator SourceLocationSequence *() { return &Seq; } | ||
| 150 | }; | ||
| 151 | |||
| 152 | inline uint64_t SourceLocationEncoding::encode(SourceLocation Loc, | ||
| 153 | SourceLocationSequence *Seq) { | ||
| 154 | return Seq ? Seq->encode(Loc) : encodeRaw(Loc.getRawEncoding()); | ||
| 155 | } | ||
| 156 | inline SourceLocation | ||
| 157 | SourceLocationEncoding::decode(uint64_t Encoded, SourceLocationSequence *Seq) { | ||
| 158 | return Seq ? Seq->decode(Encoded) | ||
| 159 | : SourceLocation::getFromRawEncoding(decodeRaw(Encoded)); | ||
| 160 | } | ||
| 161 | |||
| 162 | } // namespace clang | ||
| 163 | #endif |