trie_search.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. /*
  2. * Copyright 2011 Google Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "tsk_assert.h"
  17. #include "string_util.h"
  18. #include "trie_search.h"
  19. #include <stdlib.h>
  20. #include <string.h>
  21. /*
  22. * Helper macro that chooses the node half-way between the start and
  23. * end nodes. Used for binary search.
  24. */
  25. #define MIDDLE(start, end) ((start) + ((((end) - (start)) + 1) / 2));
  26. /*
  27. * Global data structures used to perform the search. Should be
  28. * populated once at startup by a call to SetRegistryTables.
  29. */
  30. static const char* g_string_table = NULL;
  31. static const struct TrieNode* g_node_table = NULL;
  32. static size_t g_num_root_children = 0;
  33. static const REGISTRY_U16* g_leaf_node_table = NULL;
  34. static size_t g_leaf_node_table_offset = 0;
  35. /*
  36. * Create an "exception" version of the given component. For instance
  37. * if component is "foo", will return "!foo". The caller is
  38. * responsible for freeing the returned memory.
  39. */
  40. static char* StrDupExceptionComponent(const char* component) {
  41. /*
  42. * TODO(bmcquade): could use thread-local storage of sufficient size
  43. * to avoid this allocation. This should be invoked infrequently
  44. * enough that it's probably fine for us to perform the allocation.
  45. */
  46. const size_t component_len = strlen(component);
  47. char* exception_component = malloc(component_len + 2);
  48. if (exception_component == NULL) {
  49. return NULL;
  50. }
  51. memcpy(exception_component + 1, component, component_len);
  52. exception_component[0] = '!';
  53. exception_component[component_len + 1] = 0;
  54. return exception_component;
  55. }
  56. /*
  57. * Performs a binary search looking for value, between the nodes start
  58. * and end, inclusive. Would normally have static linkage but is made
  59. * public for testing.
  60. */
  61. static const struct TrieNode* FindNodeInRange(
  62. const char* value,
  63. const struct TrieNode* start,
  64. const struct TrieNode* end) {
  65. DCHECK(value != NULL);
  66. DCHECK(start != NULL);
  67. DCHECK(end != NULL);
  68. if (start > end) return NULL;
  69. while (1) {
  70. const struct TrieNode* candidate;
  71. const char* candidate_str;
  72. int result;
  73. DCHECK(start <= end);
  74. candidate = MIDDLE(start, end);
  75. candidate_str = g_string_table + candidate->string_table_offset;
  76. result = HostnamePartCmp(value, candidate_str);
  77. if (result == 0) return candidate;
  78. if (result > 0) {
  79. if (end == candidate) return NULL;
  80. start = candidate + 1;
  81. } else {
  82. if (start == candidate) return NULL;
  83. end = candidate - 1;
  84. }
  85. }
  86. }
  87. /*
  88. * Performs a binary search looking for value, between the nodes start
  89. * and end, inclusive. Would normally have static linkage but is made
  90. * public for testing.
  91. */
  92. static const char* FindLeafNodeInRange(
  93. const char* value,
  94. const REGISTRY_U16* start,
  95. const REGISTRY_U16* end) {
  96. DCHECK(value != NULL);
  97. DCHECK(start != NULL);
  98. DCHECK(end != NULL);
  99. if (start > end) return NULL;
  100. while (1) {
  101. const REGISTRY_U16* candidate;
  102. const char* candidate_str;
  103. int result;
  104. DCHECK(start <= end);
  105. candidate = MIDDLE(start, end);
  106. candidate_str = g_string_table + *candidate;
  107. result = HostnamePartCmp(value, candidate_str);
  108. if (result == 0) return candidate_str;
  109. if (result > 0) {
  110. if (end == candidate) return NULL;
  111. start = candidate + 1;
  112. } else {
  113. if (start == candidate) return NULL;
  114. end = candidate - 1;
  115. }
  116. }
  117. }
  118. /*
  119. * Searches to find a registry node with the given component
  120. * identifier and the given parent node. If parent is null, searches
  121. * starting from the root node.
  122. */
  123. const struct TrieNode* FindRegistryNode(const char* component,
  124. const struct TrieNode* parent) {
  125. const struct TrieNode* start;
  126. const struct TrieNode* end;
  127. const struct TrieNode* current;
  128. const struct TrieNode* exception;
  129. DCHECK(g_string_table != NULL);
  130. DCHECK(g_node_table != NULL);
  131. DCHECK(g_leaf_node_table != NULL);
  132. DCHECK(component != NULL);
  133. if (IsInvalidComponent(component)) {
  134. return NULL;
  135. }
  136. if (parent == NULL) {
  137. /* If parent is NULL, start the search at the root node. */
  138. start = g_node_table;
  139. end = start + (g_num_root_children - 1);
  140. } else {
  141. if (HasLeafChildren(parent) != 0) {
  142. /*
  143. * If the parent has leaf children, FindRegistryLeafNode should
  144. * have been called instead.
  145. */
  146. DCHECK(0);
  147. return NULL;
  148. }
  149. /* We'll be searching the specified parent node's children. */
  150. start = g_node_table + parent->first_child_offset;
  151. end = start + ((int) parent->num_children - 1);
  152. }
  153. current = FindNodeInRange(component, start, end);
  154. if (current != NULL) {
  155. /* Found a match. Return it. */
  156. return current;
  157. }
  158. /*
  159. * We didn't find an exact match, so see if there's a wildcard
  160. * match. From http://publicsuffix.org/format/: "The wildcard
  161. * character * (asterisk) matches any valid sequence of characters
  162. * in a hostname part. (Note: the list uses Unicode, not Punycode
  163. * forms, and is encoded using UTF-8.) Wildcards may only be used to
  164. * wildcard an entire level. That is, they must be surrounded by
  165. * dots (or implicit dots, at the beginning of a line)."
  166. */
  167. current = FindNodeInRange("*", start, end);
  168. if (current != NULL) {
  169. /*
  170. * If there was a wildcard match, see if there is a wildcard
  171. * exception match, and prefer it if so. From
  172. * http://publicsuffix.org/format/: "An exclamation mark (!) at
  173. * the start of a rule marks an exception to a previous wildcard
  174. * rule. An exception rule takes priority over any other matching
  175. * rule.".
  176. */
  177. char* exception_component = StrDupExceptionComponent(component);
  178. if (exception_component == NULL) {
  179. return NULL;
  180. }
  181. exception = FindNodeInRange(exception_component,
  182. start,
  183. end);
  184. free(exception_component);
  185. if (exception != NULL) {
  186. current = exception;
  187. }
  188. }
  189. return current;
  190. }
  191. const char* FindRegistryLeafNode(const char* component,
  192. const struct TrieNode* parent) {
  193. size_t offset;
  194. const REGISTRY_U16* leaf_start;
  195. const REGISTRY_U16* leaf_end;
  196. const char* match;
  197. const char* exception;
  198. DCHECK(g_string_table != NULL);
  199. DCHECK(g_node_table != NULL);
  200. DCHECK(g_leaf_node_table != NULL);
  201. DCHECK(component != NULL);
  202. DCHECK(parent != NULL);
  203. DCHECK(HasLeafChildren(parent) != 0);
  204. if (parent == NULL) {
  205. return NULL;
  206. }
  207. if (HasLeafChildren(parent) == 0) {
  208. return NULL;
  209. }
  210. if (IsInvalidComponent(component)) {
  211. return NULL;
  212. }
  213. offset = parent->first_child_offset - g_leaf_node_table_offset;
  214. leaf_start = g_leaf_node_table + offset;
  215. leaf_end = leaf_start + ((int) parent->num_children - 1);
  216. match = FindLeafNodeInRange(component,
  217. leaf_start,
  218. leaf_end);
  219. if (match != NULL) {
  220. return match;
  221. }
  222. /*
  223. * We didn't find an exact match, so see if there's a wildcard
  224. * match. From http://publicsuffix.org/format/: "The wildcard
  225. * character * (asterisk) matches any valid sequence of characters
  226. * in a hostname part. (Note: the list uses Unicode, not Punycode
  227. * forms, and is encoded using UTF-8.) Wildcards may only be used to
  228. * wildcard an entire level. That is, they must be surrounded by
  229. * dots (or implicit dots, at the beginning of a line)."
  230. */
  231. match = FindLeafNodeInRange("*", leaf_start, leaf_end);
  232. if (match != NULL) {
  233. /*
  234. * There was a wildcard match, so see if there is a wildcard
  235. * exception match, and prefer it if so. From
  236. * http://publicsuffix.org/format/: "An exclamation mark (!) at
  237. * the start of a rule marks an exception to a previous wildcard
  238. * rule. An exception rule takes priority over any other matching
  239. * rule.".
  240. */
  241. char* exception_component = StrDupExceptionComponent(component);
  242. if (exception_component == NULL) {
  243. return NULL;
  244. }
  245. exception = FindLeafNodeInRange(exception_component,
  246. leaf_start,
  247. leaf_end);
  248. free(exception_component);
  249. if (exception != NULL) {
  250. match = exception;
  251. }
  252. }
  253. return match;
  254. }
  255. const char* GetHostnamePart(size_t offset) {
  256. DCHECK(g_string_table != NULL);
  257. return g_string_table + offset;
  258. }
  259. int HasLeafChildren(const struct TrieNode* node) {
  260. if (node == NULL) { return 0; }
  261. if (node->first_child_offset < g_leaf_node_table_offset) return 0;
  262. return 1;
  263. }
  264. void SetRegistryTables(const char* string_table,
  265. const struct TrieNode* node_table,
  266. size_t num_root_children,
  267. const REGISTRY_U16* leaf_node_table,
  268. size_t leaf_node_table_offset) {
  269. g_string_table = string_table;
  270. g_node_table = node_table;
  271. g_num_root_children = num_root_children;
  272. g_leaf_node_table = leaf_node_table;
  273. g_leaf_node_table_offset = leaf_node_table_offset;
  274. }