rdp-algo.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. <?php
  2. //require_once("aken-simplify.php");
  3. //require_once("andrii-simplify.php");
  4. require_once("db-frontend.php");
  5. require_once("rdp.php");
  6. //getStandardDeviations();
  7. //testDataset();
  8. //testCacheDatabase();
  9. function testCacheDatabase() {
  10. // tests getting data from the cache, either all or a time range
  11. if(false) var_dump(CACHE_DB::getCachedData("A", "radon"));
  12. if(false) var_dump(CACHE_DB::getCachedData(
  13. "A",
  14. "radon",
  15. time() - (4 /* days */ * 24 * 60 * 60),
  16. time()
  17. ));
  18. // tests creating or refreshing the cache based on a time range
  19. if(false) CACHE_DB::createCacheDb(
  20. time() - (1 /* days */ * 24 * 60 * 60),
  21. time()
  22. ); else CACHE_DB::createCacheDb(0, PHP_INT_MAX, true);
  23. exit();
  24. }
  25. /* pulling a years worth of readings gets close to php.ini's max memory
  26. * of 128MB, with a single sensor reading every five minutes and a
  27. * year's worth of reading somehow PHP uses more than 75 MegaBytes
  28. *
  29. * try to free up memory as soon as we no longer need it
  30. */
  31. define("CLEAN_UP_MEMORY_AS_FAST_AS_POSSIBLE", NULL);
  32. // draw an interesting curve to test RDP against
  33. $curve = array(array(/* x value array */), array(/* y value array */));
  34. // sensor might get used in another code block
  35. $sensor;
  36. define("USE_SENSOR_READINGS", NULL);
  37. if(defined("USE_SENSOR_READINGS")) {
  38. // testing
  39. $sensors = new TheInternet();
  40. // overwrite demo curve
  41. $sensor = $sensors->getSensor("R");
  42. if(defined("CLEAN_UP_MEMORY_AS_FAST_AS_POSSIBLE")) {
  43. /* sensors is about 100K, we do not need it now that we have the one
  44. * we want
  45. */
  46. unset($sensors);
  47. }
  48. $data = new DataSqueeze();
  49. $before = memory_get_usage();
  50. var_dump("using {$before} bytes before rows");
  51. $rows = $data->getRows($sensor);
  52. //$curve = $data->getRows($sensor, "radon", );
  53. // convert sensor readings from array("time"=>x, "reading"=>y)
  54. $rowCount = count($rows[0]);
  55. for($i = $rowCount; $i--;) {
  56. $row = NULL;
  57. if(defined("CLEAN_UP_MEMORY_AS_FAST_AS_POSSIBLE")) {
  58. $row = array(array_pop($rows[0]), array_pop($rows[1]));
  59. } else {
  60. $row = $rows[$i];
  61. }
  62. $curve[0][] = $row[0];
  63. $curve[1][] = $row[1];
  64. }
  65. if(defined("CLEAN_UP_MEMORY_AS_FAST_AS_POSSIBLE")) {
  66. /* even though it is empty in PHP 5.3 the `rows` variable was eating
  67. * a MegaByte
  68. */
  69. unset($rows, $data);
  70. }
  71. $after = memory_get_usage();
  72. $diff = $after - $before;
  73. var_dump("using {$after} bytes after [$rowCount] rows diff:({$diff} bytes)");
  74. // rows were reversed by loop
  75. $curve[0] = array_reverse($curve[0]);
  76. $curve[1] = array_reverse($curve[1]);
  77. } else {
  78. for($i = 0; $i < 1000; $i++) {
  79. $x = ($i * .01);
  80. $var = array($i, exp(-$x) * cos(2 * M_PI * $x));
  81. $curve[0][] = $var[0];
  82. $curve[1][] = $var[1];
  83. }
  84. }
  85. define("TEST_DATA_SQUEEZE_FIR", NULL);
  86. if(defined("TEST_DATA_SQUEEZE_FIR")) {
  87. // run the data through a simple FIR filter
  88. $delay = /* number of samples until filter starts */ 10;
  89. $period = /* time between samples in seconds */ 1;
  90. /* [!] do not try to use one second for the period on a real sensor
  91. *
  92. * this would cause the filter to regenerate readings every second
  93. * instead of whatever the normal period would be, RadonEye for example
  94. * should be once every 300 seconds...
  95. */
  96. if(defined("USE_SENSOR_READINGS")) {
  97. $delay = $sensor->options->getSampleDepth();
  98. $period = $sensor->options->getSamplePeriod();
  99. var_dump("using delay: {$delay} samples, period: {$period} seconds");
  100. }
  101. $filter = new DataSqueeze();
  102. // process data
  103. $t0 = microtime(true);
  104. $fCurve = $filter->simpleFIR($curve, $delay, $period);
  105. /* [?] the filter returns floating point values, values read from the
  106. * database are integers, the filtered results should undergo quantization
  107. * to return them to the more familiar precision that they started from
  108. *
  109. * this will lose resolution that we gained from the filter but should be
  110. * minimal, assuming the values stored in the database are not normalized
  111. * eg: [-1..0..1]
  112. */
  113. for($i=count($fCurve[0]);$i--;) $fCurve[1][$i] = intval($fCurve[1][$i]);
  114. $t0 = "(cpu time " . number_format(1000 * (microtime(true) - $t0), 2) . "ms)";
  115. $filteredCount = count($fCurve);
  116. var_dump("to process the filter[{$filteredCount}]: {$t0}<br>");
  117. } else {
  118. $fCurve = $curve;
  119. }
  120. /* [?] smaller epsilon means more points and higher accuracy, but costs more
  121. * CPU resources and will take longer to transfer the data to the end user
  122. *
  123. * Emma points out that the epsilon will need to vary depending on the
  124. * units for each graph, for example a graph with a range of 0-1 will have
  125. * an epsilon less than one, whereas a graph with a range of 0-1000 should
  126. * use a larger epsilon, most likely greater than ten.
  127. */
  128. $epsilon = defined("USE_SENSOR_READINGS")
  129. ? $sensor->options->getSquishFactor()
  130. : 0.01;
  131. // initiate the object, let it copy the graph
  132. $testing = new RamerDouglasPeucker($fCurve);
  133. // run the algorithm
  134. $t0 = microtime(true);
  135. $rdpCurve = $testing->getRDP($epsilon);
  136. /* test some other libraries:
  137. *
  138. * there were two people who ported simplify.js, who claims to have made a
  139. * fast enough version of polyline simplification algorithm, they improved
  140. * speed by first reducing the number of points before running RDP using
  141. * radial coordinates
  142. *
  143. * I tested to see if the results were ten times faster but found that the
  144. * reduced resolution provided by the modified algorithm gave at most a 30%
  145. * improvement. [?] to test the libraries I needed to replace all source code
  146. * `'x'` and `'y'` with `0` and `1` to match this code's use of points.
  147. *
  148. * The result of this testing and the slowness of RDP in general is leading me
  149. * to want to cache results.
  150. *
  151. * these libraries are basically the same, but I tested both:
  152. * https://raw.githubusercontent.com/aken/simplify-php/master/simplify.php
  153. * https://raw.githubusercontent.com/andriichumak/simplify-php/master/Simplify.php
  154. */
  155. //$testin9 = new Simplify();
  156. //$rdpCurve = $testin9->run($curve, $epsilon);
  157. $t0 = "(cpu time " . number_format(1000 * (microtime(true) - $t0), 2) . "ms)";
  158. // output some stats for developers
  159. $count = count($curve[0]);
  160. $count = array($count, count($rdpCurve));
  161. echo "from {$count[0]} down to {$count[1]} using e = {$epsilon} {$t0}" . PHP_EOL;
  162. echo "<br>the perpendicular distance routine was called: {$testing->called}" . PHP_EOL;
  163. $t0 = microtime(true);
  164. $last = 0;
  165. for($i = 10000; $i--;) { $last = sqrt($i + $last); }
  166. $t0 = "(cpu time " . number_format(1000 * (microtime(true) - $t0), 2) . "ms)";
  167. echo "<br>res = {$last}... running the sqrt function n times {$t0}" . PHP_EOL;
  168. // convert to chartjs dataset, x values must be quoted
  169. $data = array();
  170. $data2 = array();
  171. /* testing chartjs to see if grabbing the min and max values for the x-axis
  172. * help with the problem where the values either smash together at x=0
  173. * when x values are presented as integers, or
  174. * where when I make them strings, they sometimes do not interlace, for small
  175. * datasets they interlace, for large datasets they are serial, eg:
  176. * [0...999...0...999]
  177. */
  178. $min = PHP_INT_MAX; $max = 0; $ymin = 0; $ymax = 0;
  179. $count = count($curve[0]);
  180. $max = max($max, $curve[0][count($curve[0]) -1]);
  181. $min = min($min, $curve[0][0]);
  182. for($i = $count; $i--;) {
  183. $o = intval(-1 + $count - $i);
  184. $data[] = "{x:{$curve[0][$o]}, y:{$curve[1][$o]}}";
  185. $ymax = max($ymax, $curve[1][$o]);
  186. if(defined("CLEAN_UP_MEMORY_AS_FAST_AS_POSSIBLE")) {
  187. unset($curve[0][$o], $curve[1][$o]);
  188. }
  189. }
  190. foreach($rdpCurve as $k => $v) {
  191. $data2[] = "{x:{$k}, y:{$v}}";
  192. $max = max($max, $k);
  193. $min = min($min, $k);
  194. $ymax = max($ymax, $v);
  195. }
  196. //var_dump($data2);
  197. /* testing:
  198. * for a years worth of data setting increasing the epsilon to an order
  199. * of magnitude higher gave us a reasonable (from 87886 readings to 1227
  200. * 1227 for 'R' between `1635368506` through `1664229791`, a sensor that reads
  201. * once per minute, it had several power failures, explaining the missing
  202. * readings)
  203. *
  204. * The noise in the readings is preserved when using the above settings, if
  205. * we increase the sample depth of the sensor to flatten the noise out, for a
  206. * year, two magnitudes more sample depth, and decreasing the default
  207. * epsilon by half magnitude gave a nice line
  208. *
  209. * [?] increasing sample depth also decreases the overall amplitude of the
  210. * signal but gives a easier to read graph
  211. *
  212. * [?] increasing or decreasing the epsilon adds or removes load to the
  213. * computer, there is a sweet spot that preserves enough detail while reducing
  214. * the number of data points in the output, the sweet spot depends on sensor,
  215. * it can be estimated by looking at deviations of the output graphs, we used
  216. * a tenth of the standard deviation as starting values for each sensor, using
  217. * a smaller value causes more work on the computer, but a tenth seemed like
  218. * not too big of a load on CPU while guarantying that one would not notice
  219. * a difference in the output graph.
  220. */
  221. if(/* redo with an offset */ true) {
  222. // offset the charts by the `max` amount so I can compare them
  223. $data2 = array();
  224. foreach($rdpCurve as $k => $v) {
  225. $v += $ymax;
  226. $data2[] = "{x:{$k}, y:{$v}}";
  227. }
  228. $ymax *= 2;
  229. }
  230. /* [!] warning about chartjs
  231. *
  232. * After spending about an hour trying to figure out why all points were
  233. * being rendered on the x-axis at 0 (zero) I found that by changing the
  234. * value of each point's x from integer to string caused the chart to render
  235. * correctly. I checked the documentation and found that the data structure
  236. * for a dataset is documented as supporting integer values:
  237. * `dataset:[{ data: [{x: 10, y: 20}, {x: 15, y: null}] }]`
  238. *
  239. * But testing shows this not to work. I have added quotes around each `x`
  240. * value, but that just seems wrong.
  241. * `dataset:[{ data: [{x: '10', y: 20}, {x: '15', y: null}] }]`
  242. *
  243. * [[ an update to the above warning, I figured this out a couple weeks later,
  244. * using `type: 'time'` worked as long as I included a `date` adapter for
  245. * chartjs, apparently in version 3 they removed time type parsing, but
  246. * they have a linear type, this worked fine without needing to put the
  247. * single quotes around `x` values. ]]
  248. *
  249. *
  250. * [?] chartjs performance options
  251. * normalized: true - informs that data indices are unique and sorted
  252. * parse: false - data has been prepared in their internal data format
  253. * min / max scales - calculate the minimum and maximum xy scales
  254. * minRotation / maxRotation - set to same value (rotation of x axis labels)
  255. *
  256. */
  257. echo "
  258. <div><canvas id='myChart'></canvas></div>
  259. <script src='http://cdn.jsdelivr.net/npm/chart.js'></script>
  260. <script>
  261. const config = {
  262. type: 'line',
  263. data: {
  264. datasets: [
  265. {label: 'sinusoid', data: [" . implode(",", $data) . "]},
  266. {label: 'rdp', data: [" . implode(",", $data2) . "],
  267. borderColor: 'rgba(255,0,0,0.5)'},
  268. ]
  269. },
  270. options: {
  271. normalized: true,
  272. parsing: false,
  273. animation: false,
  274. scales:{
  275. x: {
  276. type: 'linear',
  277. ticks: {
  278. maxRotation: 0,
  279. minRotation: 0,
  280. },
  281. min: {$min},
  282. max: {$max},
  283. },
  284. y: {
  285. min: {$ymin},
  286. max: {$ymax},
  287. }
  288. }
  289. }
  290. };
  291. const myChart = new Chart(document.getElementById('myChart'), config);
  292. </script>
  293. ";
  294. function getStandardDeviations() {
  295. /* for whatever reason users always seem to request all data from all time
  296. * even though they never use it
  297. *
  298. * the time to handle the user's request is negligible, but the time for them
  299. * to download the results takes a long time, both because it is a lot of data
  300. * and they are using a horrible transfer mechanism, HTTP
  301. *
  302. * HTTP alone slows down the transfer by at least 10x, then they are asking for
  303. * thousands upon thousands of data-points, but then, once they get all the
  304. * data, the screen that they use cannot even display all the data-points so
  305. * they just show a single data-point. together these two work to reduce the
  306. * response speed of the computer by at least 4 magnitudes. That's amazing!
  307. */
  308. // testing
  309. $sensors = new TheInternet();
  310. $sensor = $sensors->getSensor("A");
  311. $data = new DataSqueeze();
  312. /* using standard deviation can give us a baseline for what reasonable
  313. * values we can use for Ramer-Douglas-Peucker epsilon
  314. */
  315. foreach(array('A', 'B', 'C', 'R', 'S') as $v) {
  316. var_dump("std deviation {$v}", $data->getStdDeviation($sensors->getSensor($v)));
  317. }
  318. /* get standard deviations for each type of reading for each of the BME sensors
  319. *
  320. * use these values to help determine reasonable epsilon for
  321. * Ramer-Douglas-Peucker
  322. */
  323. $types = explode(" ", BME680_COLUMNS);
  324. array_shift(/* remove "time" from types */ $types);
  325. array_shift(/* remove "sensor" from types */ $types);
  326. foreach($types as $t) {
  327. foreach(array("H", "I", "J") as $s) {
  328. var_dump("std deviation {$t}: {$s}",
  329. $data->getStdDeviation($sensors->getSensor($s), $t)
  330. );
  331. }
  332. }
  333. }
  334. require_once("fantastic.php");
  335. function testDataset() {
  336. // TheInternet has a list of all the sensor clusters
  337. $internet = new TheInternet();
  338. // expand the list into all possible views of the data (Datasets)
  339. $datasets = Dataset::makeAllDatasetsFromInternet($internet);
  340. // we need an interface to get data from the database
  341. $data = new DataSqueeze();
  342. // just use one of the datasets for testing
  343. $testSensor = $datasets[0];
  344. // each dataset knows information about it's sensor
  345. var_dump($testSensor->getTypeName());
  346. $rows = $data->getRows($testSensor->parentSensorCluster, $testSensor->getTypeName());
  347. /* for FIR, period is fixed, delay can be modified to change how much
  348. * filtering is performed by the FIR, in general (but not always) users
  349. * will probably benefit from more filtering when "zoomed" out to give a
  350. * better idea of trends
  351. */
  352. $delay = $testSensor->parentSensorCluster->options->getSampleDepth();
  353. $period = $testSensor->parentSensorCluster->options->getSamplePeriod();
  354. $filtered = $data->simpleFIR($rows, $delay, $period);
  355. /* filtered data is should be run through the RDP algorithm to reduce
  356. * the number of points needed to represent a given line, one can be more
  357. * aggressive when heavy filtering is performed. Aggressive just means
  358. * a higher degree of leeway is given to the RDP algorithm when considering
  359. * if a line is considered near a sampled /filtered data point
  360. */
  361. $rdp = new RamerDouglasPeucker($filtered);
  362. $final = $rdp->getRDP(
  363. $testSensor->parentSensorCluster->options->getSquishFactor(
  364. $testSensor->getTypeName()
  365. )
  366. );
  367. var_dump($final);
  368. var_dump(count($datasets));
  369. }