On a new cluster installation, I am having problems getting information from
the remote nodes. gmond and greceptor are running fine on the nodes as well as
on the frontend, and gmetad on the frontend shows no problems either.
But on the web page I get the famous error:
Cannot find any metrics for selected cluster "clusterIBMM", exiting.
Check ganglia XML tree (telnet 127.0.0.1 8652)
I tried the suggested telnet command:
Trying 127.0.0.1...
Connected to localhost.localdomain (127.0.0.1).
Escape character is '^]'.
<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
<!DOCTYPE GANGLIA_XML [
<!ELEMENT GANGLIA_XML (GRID|CLUSTER|HOST)*>
<!ATTLIST GANGLIA_XML VERSION CDATA #REQUIRED>
<!ATTLIST GANGLIA_XML SOURCE CDATA #REQUIRED>
<!ELEMENT GRID (CLUSTER | GRID | HOSTS | METRICS)*>
<!ATTLIST GRID NAME CDATA #REQUIRED>
<!ATTLIST GRID AUTHORITY CDATA #REQUIRED>
<!ATTLIST GRID LOCALTIME CDATA #IMPLIED>
<!ELEMENT CLUSTER (HOST | HOSTS | METRICS)*>
<!ATTLIST CLUSTER NAME CDATA #REQUIRED>
<!ATTLIST CLUSTER OWNER CDATA #IMPLIED>
<!ATTLIST CLUSTER LATLONG CDATA #IMPLIED>
<!ATTLIST CLUSTER URL CDATA #IMPLIED>
<!ATTLIST CLUSTER LOCALTIME CDATA #REQUIRED>
<!ELEMENT HOST (METRIC)*>
<!ATTLIST HOST NAME CDATA #REQUIRED>
<!ATTLIST HOST IP CDATA #REQUIRED>
<!ATTLIST HOST LOCATION CDATA #IMPLIED>
<!ATTLIST HOST REPORTED CDATA #REQUIRED>
<!ATTLIST HOST TN CDATA #IMPLIED>
<!ATTLIST HOST TMAX CDATA #IMPLIED>
<!ATTLIST HOST DMAX CDATA #IMPLIED>
<!ATTLIST HOST GMOND_STARTED CDATA #IMPLIED>
<!ELEMENT METRIC EMPTY>
<!ATTLIST METRIC NAME CDATA #REQUIRED>
<!ATTLIST METRIC VAL CDATA #REQUIRED>
<!ATTLIST METRIC TYPE (string | int8 | uint8 | int16 | uint16 | int32
| uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRIC UNITS CDATA #IMPLIED>
<!ATTLIST METRIC TN CDATA #IMPLIED>
<!ATTLIST METRIC TMAX CDATA #IMPLIED>
<!ATTLIST METRIC DMAX CDATA #IMPLIED>
<!ATTLIST METRIC SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
<!ATTLIST METRIC SOURCE (gmond | gmetric) #REQUIRED>
<!ELEMENT HOSTS EMPTY>
<!ATTLIST HOSTS UP CDATA #REQUIRED>
<!ATTLIST HOSTS DOWN CDATA #REQUIRED>
<!ATTLIST HOSTS SOURCE (gmond | gmetric | gmetad) #REQUIRED>
<!ELEMENT METRICS EMPTY>
<!ATTLIST METRICS NAME CDATA #REQUIRED>
<!ATTLIST METRICS SUM CDATA #REQUIRED>
<!ATTLIST METRICS NUM CDATA #REQUIRED>
<!ATTLIST METRICS TYPE (string | int8 | uint8 | int16 | uint16 | int32
| uint32 | float | double | timestamp) #REQUIRED>
<!ATTLIST METRICS UNITS CDATA #IMPLIED>
<!ATTLIST METRICS SLOPE (zero | positive | negative | both |
unspecified) #IMPLIED>
<!ATTLIST METRICS SOURCE (gmond | gmetric) #REQUIRED>
]>
<GANGLIA_XML VERSION="3.0.7" SOURCE="gmetad">
<GRID NAME="unspecified" AUTHORITY="
http://cluster.ibmm.univ-montp1.fr/ganglia/" LOCALTIME="1263393509">
<CLUSTER NAME="clusterIBMM" LOCALTIME="1263393507" OWNER="IBMM" LATLONG=" "
URL="http://www.ibmm.univ-montp1.fr/">
</CLUSTER>
</GRID>
</GANGLIA_XML>
Connection closed by foreign host.
The most interesting things are:
* There is nothing in the rrds directory in /var/lib/ganglia, although the
permissions are OK (user nobody).
* The output of "gstat -a" shows no nodes at all:
[root@cluster init.d]# gstat -a
CLUSTER INFORMATION
Name: clusterIBMM
Hosts: 0
Gexec Hosts: 0
Dead Hosts: 0
Localtime: Wed Jan 13 15:40:17 2010
There are no hosts up at this time
even though the nodes are alive...
gmond config file :
[root@cluster init.d]# more /etc/gmond.conf
/*
Ganglia gmond configuration file for cluster
DO NOT EDIT - Automatically generated by dbreport
*/
/* Global Configuration */
globals {
daemonize = yes
setuid = yes
user = nobody
debug_level = 0
max_udp_msg_len = 1472
mute = no
deaf = no
host_dmax = 0 /*secs */
cleanup_threshold = 300 /*secs */
gexec = no
}
/* Cluster Specific attributes */
cluster {
name = "clusterIBMM"
owner = "IBMM"
latlong = " "
url = "http://www.ibmm.univ-montp1.fr/"
}
/* Host configuration */
host {
location="0,0,0"
}
/* UDP Channels for Send and Recv */
udp_recv_channel {
mcast_join = 229.254.66.56
port = 8649
}
udp_send_channel {
mcast_join = 229.254.66.56
port = 8649
}
/* TCP Accept Channel */
tcp_accept_channel {
port = 8649
acl {
default = "deny"
access {
ip = 127.0.0.1
mask = 32
action = "allow"
}
access {
ip = 192.168.0.0
mask = 24
action = "allow"
}
access {
ip = 192.168.1.0
mask = 24
action = "allow"
}
}
}
I have tried restarting all the Ganglia services and rebooting all nodes
(including the frontend)...
Here are my iptables rules :
[root@cluster init.d]# iptables --list
Chain INPUT (policy ACCEPT)
target prot opt source destination
ACCEPT all -- anywhere anywhere
ACCEPT all -- anywhere anywhere
ACCEPT tcp -- anywhere anywhere state NEW tcp
dpt:ssh
ACCEPT tcp -- 192.168.212.0/24 anywhere state NEW tcp
dpt:https
ACCEPT tcp -- 192.168.212.0/24 anywhere state NEW tcp
dpt:http
ACCEPT icmp -- anywhere anywhere icmp any
ACCEPT all -- anywhere anywhere state
RELATED,ESTABLISHED
REJECT udp -- anywhere anywhere udp dpts:0:1024
reject-with icmp-port-unreachable
REJECT tcp -- anywhere anywhere tcp dpts:0:1024
reject-with icmp-port-unreachable
REJECT udp -- anywhere anywhere udp dpt:8649
reject-with icmp-port-unreachable
REJECT udp -- anywhere anywhere udp dpt:mysql
reject-with icmp-port-unreachable
REJECT tcp -- anywhere anywhere tcp dpt:mysql
reject-with icmp-port-unreachable
Chain FORWARD (policy DROP)
target prot opt source destination
ACCEPT all -- anywhere anywhere state
NEW,RELATED,ESTABLISHED
ACCEPT all -- anywhere anywhere
Chain OUTPUT (policy ACCEPT)
target prot opt source destination
Network :
[root@cluster init.d]# route
Kernel IP routing table
Destination Gateway Genmask Flags Metric Ref Use
Iface
192.168.212.0 * 255.255.255.0 U 0 0 0 eth1
192.168.1.0 * 255.255.255.0 U 0 0 0 eth0
192.168.0.0 * 255.255.255.0 U 0 0 0 eth0
169.254.0.0 * 255.255.0.0 U 0 0 0 eth1
default 192.168.212.1 0.0.0.0 UG 0 0 0 eth1
I have run out of ideas on how to get information sent from the nodes to the
frontend...
Any help would be appreciated.
Thanks,
Regards.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: https://lists.sdsc.edu/pipermail/npaci-rocks-discussion/attachments/20100113/dda5f978/attachment.html
2010/1/13 remy d1 <rem...@gmail.com>
(it's in /etc/sysconfig/static-routes, multicast is very important for
gmond...)
It's working now... I do not know exactly why, because the change did not take
effect immediately afterwards.
Regards.
2010/1/13 remy d1 <rem...@gmail.com>