Sometimes spine crashes with a segmentation fault. Tested with 0.8.7g and the SVN version.
With GDB, I traced the crash to the following line:
poller.c, function poll_device:
Code: Select all
--- /z/spine/branches/main/poller.c 2011-03-24 11:53:58.000000000 +0200
+++ poller.c 2011-03-25 01:09:41.000000000 +0200
@@ -1052,7 +1085,7 @@
}
}
- free(poll_result);
+// free(poll_result);
SPINE_LOG_MEDIUM(("Device[%i] TH[%i] DS[%i] SS[%i] SERVER: %s, output: %s", device_id, device_thread, poller_items[i].local_data_id, php_process, poller_items[i].arg1, poller_items[i].result));
Patch #2
Sometimes, some devices do not answer SNMP queries right away.
I've made the following dirty fix. It works: 90% of the time, polling is successful.
Of course, it can be made a lot cleaner, but I don't really have the time for it:
Code: Select all
--- /z/spine/branches/main/poller.c 2011-03-24 11:53:58.000000000 +0200
+++ poller.c 2011-03-25 01:09:41.000000000 +0200
@@ -888,7 +888,23 @@
if (num_oids > 0) {
snmp_get_multi(device, snmp_oids, num_oids);
-
+ if (device->ignore_device)
+//Try again, after a delay of 0.05sec
+ {
+ device->ignore_device=FALSE;usleep(50000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#1]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+ }
+
+
+ if (device->ignore_device)
+//Try yet again, this time after a delay of 0.5 sec.
+ {
+ device->ignore_device=FALSE;usleep(500000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#2]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+ }
+
for (j = 0; j < num_oids; j++) {
if (device->ignore_device) {
SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], ignoring device '%s'", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
@@ -947,6 +963,23 @@
if (num_oids >= device->max_oids) {
snmp_get_multi(device, snmp_oids, num_oids);
+ if (device->ignore_device)
+//Try again, after a delay of 0.05sec
+ {
+ device->ignore_device=FALSE;usleep(50000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#3]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+ }
+
+
+ if (device->ignore_device)
+//Try yet again, this time after a delay of 0.5 sec.
+ {
+ device->ignore_device=FALSE;usleep(500000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#4]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+
+ }
for (j = 0; j < num_oids; j++) {
if (device->ignore_device) {
@@ -1077,6 +1110,22 @@
/* process last multi-get request if applicable */
if (num_oids > 0) {
snmp_get_multi(device, snmp_oids, num_oids);
+ if (device->ignore_device)
+//Try again, after a delay of 0.05sec
+ {
+ device->ignore_device=FALSE;usleep(50000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#5]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+ }
+
+
+ if (device->ignore_device)
+//Try yet again, this time after a delay of 0.5 sec.
+ {
+ device->ignore_device=FALSE;usleep(500000);
+ snmp_get_multi(device, snmp_oids, num_oids);
+ SPINE_LOG(("Device[%i] TH[%i] DS[%i] WARNING: SNMP timeout detected [%i ms], retrying device '%s' [#6]", device_id, device_thread, poller_items[snmp_oids[j].array_position].local_data_id, device->snmp_timeout, device->hostname));
+ }
for (j = 0; j < num_oids; j++) {
if (device->ignore_device) {